import time
time_start_notebook = time.time()
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
# usual imports
!pip install watermark
!pip install scikit-plot
# HPO
!git clone https://github.com/thuijskens/scikit-hyperband.git
sys.path.append('scikit-hyperband/hyperband')
# update modules
!pip uninstall xgboost
!pip install -U xgboost
print('Environment: Google Colab')
from search import HyperbandSearchCV
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
sns.set()
import joblib
from tqdm import tqdm_notebook as tqdm
import plotly_express as px
from sklearn.preprocessing import OneHotEncoder
import imblearn
from imblearn.over_sampling import SMOTE
import sklearn.metrics as skmetrics
# warnings
import warnings
from sklearn.exceptions import ConvergenceWarning
from scipy.optimize.linesearch import LineSearchWarning
warnings.simplefilter('ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)
warnings.simplefilter('ignore', category=LineSearchWarning)
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%load_ext watermark
%watermark -iv
json 2.0.9 joblib 0.17.0 numpy 1.19.4 imblearn 0.7.0 plotly_express 0.4.1 seaborn 0.11.0 autopep8 1.5.2 pandas 1.1.5
def show_methods(obj, ncols=4,contains=None):
lst = [i for i in dir(obj) if i[0]!='_' ]
if contains is not None:
lst = [i for i in lst if contains in i]
df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
return df
path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'
if ENV_COLAB:
path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv.csv'
df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)
print(df_train.shape)
print(df_test.shape)
df_train.head(2).append(df_train.tail(2))
(5634, 21) (1409, 21)
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1621-YNCJH | Female | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.4 | No |
| 1 | 7143-BQIBA | Male | 0 | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No |
| 5632 | 0862-PRCBS | Female | 0 | Yes | Yes | 68 | Yes | Yes | Fiber optic | No | Yes | No | Yes | Yes | Yes | Two year | Yes | Credit card (automatic) | 103.75 | 7039.45 | No |
| 5633 | 4656-CAURT | Male | 0 | No | No | 69 | Yes | Yes | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Bank transfer (automatic) | 23.95 | 1713.1 | No |
target_name = 'Churn'
import plotly_express as px
px.histogram(df_train, x=target_name,height=300,width=300)
px.histogram(df_train, x='gender', color=target_name,height=300,width=300)
def clean_data(dfx):
dfx = dfx.copy()
# from eda we see that gender has no effect
cols_drop = ['customerID','gender']
dfx = dfx.drop(cols_drop,axis=1)
# replace values
dic_replace = [
{'SeniorCitizen': {0:'No', 1:'Yes'}},
{'MultipleLines': {'No phone service':'N/A'}},
{'SeniorCitizen': {'No':'Not_SenCit', 'Yes':'SeniorCitizen'}},
{'Partner': {'No':'No_Partner', 'Yes':'Partner'}},
{'Dependents': {'No':'No_Dependents', 'Yes':'Dependents'}},
{'PaperlessBilling': {'No':'No_PaperlessBill', 'Yes':'PaperlessBill'}},
{'PhoneService': {'No':'No_PhoneService', 'Yes':'PhoneService'}},
{'MultipleLines': {'No':'No_MultiLines', 'Yes':'MultiLines', 'N/A': 'No_PhoneService'}},
{'InternetService': {'No':'No_internet_service'}},
{'OnlineSecurity': {'No':'No_OnlineSecurity', 'Yes':'OnlineSecurity'}},
{'OnlineBackup': {'No':'No_OnlineBackup', 'Yes':'OnlineBackup'}},
{'DeviceProtection': {'No':'No_DeviceProtection', 'Yes':'DeviceProtection'}},
{'TechSupport': {'No':'No_TechSupport', 'Yes':'TechSupport'}},
{'StreamingTV': {'No':'No_StreamingTV', 'Yes':'StreamingTV'}},
{'StreamingMovies': {'No':'No_StreamingMov', 'Yes':'StreamingMov'}}
]
for dic in dic_replace:
dfx = dfx.replace(dic)
# impute
dfx['TotalCharges'] = pd.to_numeric(dfx['TotalCharges'],errors='coerce').fillna(0)
# sum of features
dfx['SenCit_Dependents'] = dfx['SeniorCitizen'] + '_' + dfx['Dependents']
dfx['SenCit_Partner'] = dfx['SeniorCitizen'] + '_' + dfx['Partner']
dfx['SenCit_Contract'] = dfx['SeniorCitizen'] + '_' + dfx['Contract']
dfx['SenCit_TechSupport'] = dfx['SeniorCitizen'] + '_' + dfx['TechSupport']
dfx['SenCit_PayMeth'] = dfx['SeniorCitizen'] + '_' + dfx['PaymentMethod']
dfx['Partner_Dependents'] = dfx['Partner'] + '_' + dfx['Dependents']
# aggration features
temp = (dfx.groupby('Contract')['TotalCharges'].agg(['mean'])
.rename({'mean':'Contract_mean_totCharges'},axis=1))
dfx = pd.merge(dfx, temp, on='Contract', how='left')
dfx['Contract_totCharges_diff'] = (dfx['TotalCharges']
- dfx['Contract_mean_totCharges'])
temp = (dfx.groupby('PaymentMethod')['MonthlyCharges'].agg(['mean'])
.rename({'mean':'PayMeth_mean_monthCharges'},axis=1))
dfx = pd.merge(dfx, temp, on='PaymentMethod', how='left')
dfx['PayMeth_monthCharges_diff'] = (dfx['MonthlyCharges']
- dfx['PayMeth_mean_monthCharges'])
multiLines_dict = {'No_PhoneService':0, 'No_MultiLines':1, 'MultiLines':2}
dfx['MultipleLines_Ordinal'] = dfx['MultipleLines'].map(multiLines_dict)
# Ordinal encoding of 'InternetService'
intServ_dict = {'No_internet_service':0, 'DSL':1, 'Fiber_optic':2}
dfx['InternetService_Ordinal'] = dfx['InternetService'].map(intServ_dict)
# Ordinal encoding of 'Contract'
contract_dict = {'Month-to-month':0, 'One_year':1, 'Two_year':2}
dfx['Contract_Ordinal'] = dfx['Contract'].map(contract_dict)
# Drop unnecessary columns that have been encoded
ordinal_drop_cols = ['MultipleLines', 'InternetService', 'Contract']
dfx.drop(ordinal_drop_cols, axis=1, inplace=True)
# Apply one-hot encoder to the relevant columns
cols_ohe = ['SeniorCitizen', 'Partner', 'Dependents',
'PaperlessBilling', 'PhoneService', 'OnlineSecurity',
'OnlineBackup', 'DeviceProtection', 'TechSupport',
'StreamingTV', 'StreamingMovies', 'PaymentMethod',
'SenCit_Dependents', 'Partner_Dependents', 'SenCit_Partner',
'SenCit_Contract', 'SenCit_TechSupport', 'SenCit_PayMeth']
enc_ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
df_ohe = pd.DataFrame(enc_ohe.fit_transform(dfx[cols_ohe]))
# Replace default column names with more descriptive ones
df_ohe.columns = enc_ohe.get_feature_names(cols_ohe)
# One-hot encoding removed index; put it back
df_ohe.index = dfx.index
# Remove categorical columns (will replace with one-hot encoding)
dfx.drop(cols_ohe, axis=1, inplace=True)
# Add one-hot encoded columns to numerical features
dfx = pd.concat([dfx, df_ohe], axis=1)
# remove columns
cols_drop = ['InternetService_Ordinal','Contract_Ordinal']
dfx = dfx.drop(cols_drop, axis=1)
# remove white spaces from column names
dfx = dfx.rename(columns=lambda x: x.strip())
return dfx
df_train = clean_data(df_train)
df_test = clean_data(df_test)
df_Xtrain = df_train.drop(target_name,axis=1)
ser_ytrain = df_train[target_name].map({'No':0, 'Yes':1})
df_Xtest = df_test.drop(target_name,axis=1)
ser_ytest = df_test[target_name].map({'No':0, 'Yes':1})
df_Xtrain.sum().sum(), ser_ytrain.sum().sum()
(26621171.299999997, 1495)
df_Xtrain.head(2)
| tenure | MonthlyCharges | TotalCharges | Contract_mean_totCharges | Contract_totCharges_diff | PayMeth_mean_monthCharges | PayMeth_monthCharges_diff | MultipleLines_Ordinal | SeniorCitizen_Not_SenCit | SeniorCitizen_SeniorCitizen | Partner_No_Partner | Partner_Partner | Dependents_Dependents | Dependents_No_Dependents | PaperlessBilling_No_PaperlessBill | PaperlessBilling_PaperlessBill | PhoneService_No_PhoneService | PhoneService_PhoneService | OnlineSecurity_No internet service | OnlineSecurity_No_OnlineSecurity | OnlineSecurity_OnlineSecurity | OnlineBackup_No internet service | OnlineBackup_No_OnlineBackup | OnlineBackup_OnlineBackup | DeviceProtection_DeviceProtection | DeviceProtection_No internet service | DeviceProtection_No_DeviceProtection | TechSupport_No internet service | TechSupport_No_TechSupport | TechSupport_TechSupport | StreamingTV_No internet service | StreamingTV_No_StreamingTV | StreamingTV_StreamingTV | StreamingMovies_No internet service | StreamingMovies_No_StreamingMov | StreamingMovies_StreamingMov | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | SenCit_Dependents_Not_SenCit_Dependents | SenCit_Dependents_Not_SenCit_No_Dependents | SenCit_Dependents_SeniorCitizen_Dependents | SenCit_Dependents_SeniorCitizen_No_Dependents | Partner_Dependents_No_Partner_Dependents | Partner_Dependents_No_Partner_No_Dependents | Partner_Dependents_Partner_Dependents | Partner_Dependents_Partner_No_Dependents | SenCit_Partner_Not_SenCit_No_Partner | SenCit_Partner_Not_SenCit_Partner | SenCit_Partner_SeniorCitizen_No_Partner | SenCit_Partner_SeniorCitizen_Partner | SenCit_Contract_Not_SenCit_Month-to-month | SenCit_Contract_Not_SenCit_One year | SenCit_Contract_Not_SenCit_Two year | SenCit_Contract_SeniorCitizen_Month-to-month | SenCit_Contract_SeniorCitizen_One year | SenCit_Contract_SeniorCitizen_Two year | SenCit_TechSupport_Not_SenCit_No internet service | SenCit_TechSupport_Not_SenCit_No_TechSupport | SenCit_TechSupport_Not_SenCit_TechSupport | SenCit_TechSupport_SeniorCitizen_No internet service | SenCit_TechSupport_SeniorCitizen_No_TechSupport | SenCit_TechSupport_SeniorCitizen_TechSupport | SenCit_PayMeth_Not_SenCit_Bank transfer (automatic) | SenCit_PayMeth_Not_SenCit_Credit card (automatic) | SenCit_PayMeth_Not_SenCit_Electronic check | SenCit_PayMeth_Not_SenCit_Mailed check | SenCit_PayMeth_SeniorCitizen_Bank transfer (automatic) | SenCit_PayMeth_SeniorCitizen_Credit card (automatic) | SenCit_PayMeth_SeniorCitizen_Electronic check | SenCit_PayMeth_SeniorCitizen_Mailed check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 36 | 106.05 | 3834.40 | 3683.643192 | 150.756808 | 65.801934 | 40.248066 | 2 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 10 | 62.25 | 612.95 | 1370.923131 | -757.973131 | 67.564819 | -5.314819 | 1 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
ser_ytrain.head(2)
0 0 1 0 Name: Churn, dtype: int64
from imblearn.over_sampling import SMOTE # smote needs sklearn 0.23.1
import sklearn
sklearn.__version__
'0.23.1'
smote = SMOTE(sampling_strategy=0.5, random_state=SEED)
df_Xtrain_smote, ser_ytrain_smote = smote.fit_resample(df_Xtrain,ser_ytrain)
# smote = SMOTE(ratio='minority', random_state=SEED)
# df_Xtrain_smote, ser_ytrain_smote = smote.fit_sample(df_Xtrain, ser_ytrain)
Instead of Standard scaling use power transformer (yeo-johnson) for not-uniform distribution
sklearn.preprocessing.PowerTransformer(
method='yeo-johnson', *, standardize=True, copy=True)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
df_Xtrain.head()
| tenure | MonthlyCharges | TotalCharges | Contract_mean_totCharges | Contract_totCharges_diff | PayMeth_mean_monthCharges | PayMeth_monthCharges_diff | MultipleLines_Ordinal | SeniorCitizen_Not_SenCit | SeniorCitizen_SeniorCitizen | Partner_No_Partner | Partner_Partner | Dependents_Dependents | Dependents_No_Dependents | PaperlessBilling_No_PaperlessBill | PaperlessBilling_PaperlessBill | PhoneService_No_PhoneService | PhoneService_PhoneService | OnlineSecurity_No internet service | OnlineSecurity_No_OnlineSecurity | OnlineSecurity_OnlineSecurity | OnlineBackup_No internet service | OnlineBackup_No_OnlineBackup | OnlineBackup_OnlineBackup | DeviceProtection_DeviceProtection | DeviceProtection_No internet service | DeviceProtection_No_DeviceProtection | TechSupport_No internet service | TechSupport_No_TechSupport | TechSupport_TechSupport | StreamingTV_No internet service | StreamingTV_No_StreamingTV | StreamingTV_StreamingTV | StreamingMovies_No internet service | StreamingMovies_No_StreamingMov | StreamingMovies_StreamingMov | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | SenCit_Dependents_Not_SenCit_Dependents | SenCit_Dependents_Not_SenCit_No_Dependents | SenCit_Dependents_SeniorCitizen_Dependents | SenCit_Dependents_SeniorCitizen_No_Dependents | Partner_Dependents_No_Partner_Dependents | Partner_Dependents_No_Partner_No_Dependents | Partner_Dependents_Partner_Dependents | Partner_Dependents_Partner_No_Dependents | SenCit_Partner_Not_SenCit_No_Partner | SenCit_Partner_Not_SenCit_Partner | SenCit_Partner_SeniorCitizen_No_Partner | SenCit_Partner_SeniorCitizen_Partner | SenCit_Contract_Not_SenCit_Month-to-month | SenCit_Contract_Not_SenCit_One year | SenCit_Contract_Not_SenCit_Two year | SenCit_Contract_SeniorCitizen_Month-to-month | SenCit_Contract_SeniorCitizen_One year | SenCit_Contract_SeniorCitizen_Two year | SenCit_TechSupport_Not_SenCit_No internet service | SenCit_TechSupport_Not_SenCit_No_TechSupport | SenCit_TechSupport_Not_SenCit_TechSupport | SenCit_TechSupport_SeniorCitizen_No internet service | SenCit_TechSupport_SeniorCitizen_No_TechSupport | SenCit_TechSupport_SeniorCitizen_TechSupport | SenCit_PayMeth_Not_SenCit_Bank transfer (automatic) | SenCit_PayMeth_Not_SenCit_Credit card (automatic) | SenCit_PayMeth_Not_SenCit_Electronic check | SenCit_PayMeth_Not_SenCit_Mailed check | SenCit_PayMeth_SeniorCitizen_Bank transfer (automatic) | SenCit_PayMeth_SeniorCitizen_Credit card (automatic) | SenCit_PayMeth_SeniorCitizen_Electronic check | SenCit_PayMeth_SeniorCitizen_Mailed check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 36 | 106.05 | 3834.40 | 3683.643192 | 150.756808 | 65.801934 | 40.248066 | 2 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 10 | 62.25 | 612.95 | 1370.923131 | -757.973131 | 67.564819 | -5.314819 | 1 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 25 | 19.15 | 477.60 | 1370.923131 | -893.323131 | 43.792328 | -24.642328 | 1 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 7 | 20.00 | 137.60 | 1370.923131 | -1233.323131 | 67.564819 | -47.564819 | 1 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 24 | 20.30 | 459.95 | 1370.923131 | -910.973131 | 43.792328 | -23.492328 | 1 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
df_Xtrain.columns[df_Xtrain.apply(pd.Series.nunique)>5]
Index(['tenure', 'MonthlyCharges', 'TotalCharges', 'Contract_totCharges_diff',
'PayMeth_monthCharges_diff'],
dtype='object')
# Define the columns we wish to transform
cols_scale = ['tenure', 'MonthlyCharges', 'TotalCharges',
'Contract_totCharges_diff',
'PayMeth_monthCharges_diff']
# Scale the relevant columns
transformer = ColumnTransformer([('yeo_johnson', PowerTransformer(), cols_scale)],
remainder='passthrough')
transformer.fit(df_Xtrain)
df_Xtrain_scaled = pd.DataFrame(transformer.transform(df_Xtrain))
df_Xtest_scaled = pd.DataFrame(transformer.transform(df_Xtest))
#features
df_Xtrain_scaled.columns = df_Xtrain.columns
df_Xtest_scaled.columns = df_Xtest.columns
df_Xtrain_scaled.head()
| tenure | MonthlyCharges | TotalCharges | Contract_mean_totCharges | Contract_totCharges_diff | PayMeth_mean_monthCharges | PayMeth_monthCharges_diff | MultipleLines_Ordinal | SeniorCitizen_Not_SenCit | SeniorCitizen_SeniorCitizen | Partner_No_Partner | Partner_Partner | Dependents_Dependents | Dependents_No_Dependents | PaperlessBilling_No_PaperlessBill | PaperlessBilling_PaperlessBill | PhoneService_No_PhoneService | PhoneService_PhoneService | OnlineSecurity_No internet service | OnlineSecurity_No_OnlineSecurity | OnlineSecurity_OnlineSecurity | OnlineBackup_No internet service | OnlineBackup_No_OnlineBackup | OnlineBackup_OnlineBackup | DeviceProtection_DeviceProtection | DeviceProtection_No internet service | DeviceProtection_No_DeviceProtection | TechSupport_No internet service | TechSupport_No_TechSupport | TechSupport_TechSupport | StreamingTV_No internet service | StreamingTV_No_StreamingTV | StreamingTV_StreamingTV | StreamingMovies_No internet service | StreamingMovies_No_StreamingMov | StreamingMovies_StreamingMov | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | SenCit_Dependents_Not_SenCit_Dependents | SenCit_Dependents_Not_SenCit_No_Dependents | SenCit_Dependents_SeniorCitizen_Dependents | SenCit_Dependents_SeniorCitizen_No_Dependents | Partner_Dependents_No_Partner_Dependents | Partner_Dependents_No_Partner_No_Dependents | Partner_Dependents_Partner_Dependents | Partner_Dependents_Partner_No_Dependents | SenCit_Partner_Not_SenCit_No_Partner | SenCit_Partner_Not_SenCit_Partner | SenCit_Partner_SeniorCitizen_No_Partner | SenCit_Partner_SeniorCitizen_Partner | SenCit_Contract_Not_SenCit_Month-to-month | SenCit_Contract_Not_SenCit_One year | SenCit_Contract_Not_SenCit_Two year | SenCit_Contract_SeniorCitizen_Month-to-month | SenCit_Contract_SeniorCitizen_One year | SenCit_Contract_SeniorCitizen_Two year | SenCit_TechSupport_Not_SenCit_No internet service | SenCit_TechSupport_Not_SenCit_No_TechSupport | SenCit_TechSupport_Not_SenCit_TechSupport | SenCit_TechSupport_SeniorCitizen_No internet service | SenCit_TechSupport_SeniorCitizen_No_TechSupport | SenCit_TechSupport_SeniorCitizen_TechSupport | SenCit_PayMeth_Not_SenCit_Bank transfer (automatic) | SenCit_PayMeth_Not_SenCit_Credit card (automatic) | SenCit_PayMeth_Not_SenCit_Electronic check | SenCit_PayMeth_Not_SenCit_Mailed check | SenCit_PayMeth_SeniorCitizen_Bank transfer (automatic) | SenCit_PayMeth_SeniorCitizen_Credit card (automatic) | SenCit_PayMeth_SeniorCitizen_Electronic check | SenCit_PayMeth_SeniorCitizen_Mailed check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.367334 | 1.355904 | 0.883378 | 0.329235 | 1.553040 | 3683.643192 | 65.801934 | 2.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | -0.786426 | -0.056378 | -0.528932 | -0.235047 | -0.268218 | 1370.923131 | 67.564819 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | -0.034567 | -1.531684 | -0.674473 | -0.329237 | -0.901558 | 1370.923131 | 43.792328 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | -0.999430 | -1.501041 | -1.274872 | -0.568686 | -1.626825 | 1370.923131 | 67.564819 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | -0.075469 | -1.490250 | -0.695633 | -0.341573 | -0.864635 | 1370.923131 | 43.792328 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
df_Xtrain_scaled.sum().sum(), df_Xtest_scaled.sum().sum()
(13273998.150000809, 3374471.9234355474)
df_Xtrain_scaled.isna().sum().sum(), df_Xtest_scaled.isna().sum().sum()
(0, 0)
# Scale the relevant columns
transformer = ColumnTransformer([('yeo_johnson', PowerTransformer(), cols_scale)],
remainder='passthrough')
transformer.fit(df_Xtrain_smote)
df_Xtrain_smote_scaled = pd.DataFrame(transformer.transform(df_Xtrain_smote))
df_Xtest_scaled = pd.DataFrame(transformer.transform(df_Xtest)) # Xtest is NEVER oversampled.
#features
df_Xtrain_smote_scaled.columns = df_Xtrain.columns
df_Xtest_scaled.columns = df_Xtest.columns
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(df_Xtrain_smote_scaled, ser_ytrain_smote)
ypreds = model.predict(df_Xtest)
skmetrics.confusion_matrix(np.array(ser_ytest), ypreds)
array([[1035, 0],
[ 374, 0]])
# help(HyperbandSearchCV)
X = df_Xtrain_smote_scaled
y = ser_ytrain_smote
X = df_Xtrain_scaled
y = ser_ytrain
%%time
import scipy.stats as stats
from sklearn import metrics
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
import warnings
from sklearn.exceptions import ConvergenceWarning
from scipy.optimize.linesearch import LineSearchWarning
warnings.simplefilter('ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)
warnings.simplefilter('ignore', category=LineSearchWarning)
# Define our model
params_fixed = {'dual': False,
'random_state': SEED,
'n_jobs': 1
}
params_hyp = {
'max_iter': stats.randint(200,500),
'solver' : ['lbfgs', 'sag', 'newton-cg'],
'penalty' : ['l2'],
'C' : stats.loguniform(0.01, 10),
}
model = LogisticRegression(**params_fixed)
# Perform Hyperparameter Tuning
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=SEED)
grid = HyperbandSearchCV(model,params_hyp,
resource_param = 'max_iter',
min_iter = 200,
max_iter = 1000,
cv = cv,
scoring = 'roc_auc',
refit = True,
verbose = 0,
random_state = SEED
)
# grid.fit(X, y)
# # Print results of hyperparameter tuning
# print('Best parameters: ', grid.best_params_)
# print('AUC - ROC score: ', grid.best_score_)
CPU times: user 3.21 ms, sys: 253 µs, total: 3.46 ms Wall time: 2.23 ms
params_best = {'C': 0.42679058013626753, 'max_iter': 1000,
'penalty': 'l2', 'solver': 'lbfgs'}
# params_best = grid.best_params_
params = params_fixed
params.update(params_best)
model = LogisticRegression(**params)
# Xtrain ==> smote + scaled
# Xtest ==> Xtest only
model.fit(df_Xtrain_smote_scaled, ser_ytrain_smote)
ypreds = model.predict(df_Xtest)
skmetrics.confusion_matrix(np.array(ser_ytest), ypreds)
array([[384, 651],
[ 55, 319]])
# Xtrain ==> smote + scaled
# Xtest ==> Xtest + smote
model.fit(df_Xtrain_smote_scaled, ser_ytrain_smote)
ypreds_smote_scaled = model.predict(df_Xtest_scaled)
skmetrics.confusion_matrix(np.array(ser_ytest), ypreds_smote_scaled)
array([[874, 161],
[135, 239]])
# Xtrain ==> orig
# Xtest ==> orig
model.fit(df_Xtrain, ser_ytrain)
ypreds_no_smote = model.predict(df_Xtest)
skmetrics.confusion_matrix(np.array(ser_ytest), ypreds_no_smote)
array([[905, 130],
[166, 208]])
ytest = np.array(ser_ytest)
yprobs2d = model.predict_proba(df_Xtest)
def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
import sklearn.metrics as skmetrics
import scikitplot.metrics as skpmetrics
import os
acc = skmetrics.accuracy_score(ytest,ypreds)
precision = skmetrics.precision_score(ytest,ypreds)
recall = skmetrics.recall_score(ytest,ypreds)
f1 = skmetrics.f1_score(ytest,ypreds)
auc = skmetrics.roc_auc_score(ytest,ypreds)
print(skmetrics.classification_report(ytest,ypreds))
print(skmetrics.confusion_matrix(ytest,ypreds))
df_res = pd.DataFrame({'Accuracy':[acc],
'Precision': [precision],
'Recall': [recall],
'F1-score': [f1],
'AUC': [auc]},index=[model_name])
display(df_res.style.format("{:.4f}"))
if not os.path.isdir('../outputs'):
os.makedirs('../outputs')
o = '.' if ENV_COLAB else '../outputs/'
df_res.to_csv(o+f'model_{model_name}.csv',index=True)
skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
skpmetrics.plot_roc_curve(ytest,yprobs2d) # equal focus on both groups
skpmetrics.plot_confusion_matrix(ytest,ypreds)
model_eval_bin('LR',ytest,ypreds,yprobs2d,show_plots=True)
precision recall f1-score support
0 0.87 0.37 0.52 1035
1 0.33 0.85 0.47 374
accuracy 0.50 1409
macro avg 0.60 0.61 0.50 1409
weighted avg 0.73 0.50 0.51 1409
[[384 651]
[ 55 319]]
| Accuracy | Precision | Recall | F1-score | AUC | |
|---|---|---|---|---|---|
| LR | 0.4989 | 0.3289 | 0.8529 | 0.4747 | 0.6120 |
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 0 min 12 secs